LION KING MOVIE : Reviews Sentiment Analysis

Ashutosh Parida


convention followed in this notebook

Green cell: Graph/Analysis topic to be done
Blue cell: Inference from above graph/analysis

Importing Required Packages

In [243]:
# Usual packages 
import os
import json
import requests
import datetime
import time
import joblib
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from pandas.io.json import json_normalize 
from itertools import chain
In [239]:
# graph related packages
import cufflinks as cf
import plotly.offline
import matplotlib.pyplot as plt 

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)


from yellowbrick.text import TSNEVisualizer
from sklearn.feature_extraction.text import TfidfVectorizer


from wordcloud import WordCloud
from PIL import Image
from os import path
In [30]:
# text related packages
import spacy
# Load the language model
nlp = spacy.load('en_core_web_lg')

from spacy.lang.en.stop_words import STOP_WORDS

import textblob
from textblob import TextBlob


import re
In [117]:
# need for topic modelling
from sklearn import metrics
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
In [5]:
headers = {
 'Referer': 'https://www.rottentomatoes.com/m/the_lion_king_2019/reviews?type=user',\
 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, likeGecko) Chrome/74.0.3729.108 Safari/537.36',\
 'X-Requested-With': 'XMLHttpRequest',\
}


url = 'https://www.rottentomatoes.com/napi/movie/9057c2cf-7cab-317f-876f-e50b245ca76e/reviews/user'


payload = {
    'direction': 'next',
    'endCursor': '',
    'startCursor': ''
}
In [ ]:
### web scrapping Method

parent_path =os.getcwd()
## Storing all the json data in movie_rdata dir
reviews_path = parent_path +"\\movie_rdata"
os.chdir(reviews_path)

s = requests.Session()


i=0
while (i < 302):
    time.sleep(5)
    data=''
    r=''
    
    print(payload,"i=",i)
    
    r = s.get(url, headers=headers, params=payload) # GET Call
    data = r.json()
    #print(data)
    
    if(data['pageInfo']['hasNextPage']):
        next_endCursor=data['pageInfo']['endCursor']
        
    payload = {
        'direction': 'next',
        'endCursor': next_endCursor,
        'startCursor': ''
    }
    
    filename="page"+str(i)+".json"
    with open(filename, 'w') as json_file:
        json.dump(data, json_file)
    i=i+1
    
    json_file.close()
In [249]:
# Process data from the collected Dir and make a df
print(reviews_path)

total_files_processed=0

for review_file in os.listdir(reviews_path):
    if review_file.endswith(".json"):
        with open(review_file) as infile:
            #print(review_file)
            jdata=json.load(infile)
            if(total_files_processed>0):
                movie_df=movie_df.append(json_normalize(jdata['reviews']))
                #print("coming here",total_files_processed)
            else:
                movie_df=json_normalize(jdata['reviews'])
                #print("first time",total_files_processed)
          
            total_files_processed=total_files_processed+1

            
print("Total files processed=",total_files_processed)
os.chdir(parent_path)
C:\Users\trex\Downloads\ML Comp Practice\PHD\movie_rdata
Total files processed= 300
In [14]:
# resetting index as each page index it as 0-9 from json collection method
movie_df=movie_df.reset_index(drop=True)
In [15]:
movie_df.head(12)
Out[15]:
createDate displayImageUrl displayName hasProfanity hasSpoilers isSuperReviewer isVerified rating review score timeFromCreation updateDate user.accountLink user.displayName user.realm user.userId
0 2019-08-18T08:54:30.664Z None Joanne H False False False True STAR_5 I liked most that the animation made the anima... 5.0 1h ago 2019-08-18T08:54:30.890Z None Joanne H Fandango 2c73ed20-5b9f-41b3-a4fd-8dd3ff8bb20a
1 2019-08-18T08:03:49.380Z https://graph.facebook.com/v3.3/594379764/picture Frankie C False False False False STAR_5 Amazing! So realistic and incredible music 5.0 2h ago 2019-08-18T08:03:49.380Z /user/id/871398953 Frankie C RT 871398953
2 2019-08-18T07:13:32.422Z None jaycee False False False False STAR_5 Classic. Good remake. Loved it. Glover was out... 5.0 3h ago 2019-08-18T07:13:32.422Z None jaycee Fandango DD2453B0-37CE-4B47-A099-D15378FC310E
3 2019-08-18T07:06:08.698Z https://graph.facebook.com/v3.3/10000047937306... Peter A False False False False STAR_2 Nice animation/CGI but completely lacking the ... 2.0 3h ago 2019-08-18T07:06:46.610Z /user/id/906750266 Peter A RT 906750266
4 2019-08-18T06:38:46.892Z None Kevin M False False False True STAR_4 Good but go mainly for the sentimental value 4.0 3h ago 2019-08-18T06:38:46.892Z None Kevin M Fandango 21FE7D93-351C-43B5-833D-B1DB11F3FD6A
5 2019-08-18T05:34:19.360Z None Angelwolf False False False True STAR_4 The animation was really good 4.0 4h ago 2019-08-18T05:34:19.360Z None Angelwolf Fandango E62CDAD2-329A-4924-8455-35249C3BCC4D
6 2019-08-18T05:22:03.783Z None sheril False False False True STAR_5 i thought it was awesone. i really like the or... 5.0 5h ago 2019-08-18T05:22:03.783Z None sheril Fandango 8229f609-93a7-4c8e-8ec7-37b18fa65679
7 2019-08-18T05:14:42.958Z None Maggie L False False False True STAR_3 I think that they should’ve brought back more ... 3.0 5h ago 2019-08-18T05:14:42.958Z None Maggie L Fandango 8B9ED608-C9C6-4EAF-82A3-1DF9FFFD0EA4
8 2019-08-18T04:58:59.214Z None Bryan A False False False True STAR_5 Great, absolutely great! The animation was gr... 5.0 5h ago 2019-08-18T04:58:59.214Z None Bryan A Fandango C792B3A4-EC33-48C2-A911-A2AAB946B94F
9 2019-08-18T04:51:44.361Z None Bethany False False False True STAR_5 It was just like the first one. Very good anim... 5.0 5h ago 2019-08-18T04:51:44.361Z None Bethany Fandango 45c8dfaa-e20f-44df-8f01-445c0f055e42
10 2019-08-18T04:35:16.243Z None Alexander G False False False False STAR_3 Didn't make me feel anything unlike the original. 3.0 5h ago 2019-08-18T04:35:16.243Z /user/id/978203293 Alexander G RT 978203293
11 2019-08-18T04:10:11.153Z None Lori False False False True STAR_5 Well done! A great way to redo a classic. It f... 5.0 6h ago 2019-08-18T04:10:11.153Z None Lori Fandango 8ce21b61-d2b1-453e-ab78-fe0ba06088d5
In [16]:
movie_df.head()
Out[16]:
createDate displayImageUrl displayName hasProfanity hasSpoilers isSuperReviewer isVerified rating review score timeFromCreation updateDate user.accountLink user.displayName user.realm user.userId
0 2019-08-18T08:54:30.664Z None Joanne H False False False True STAR_5 I liked most that the animation made the anima... 5.0 1h ago 2019-08-18T08:54:30.890Z None Joanne H Fandango 2c73ed20-5b9f-41b3-a4fd-8dd3ff8bb20a
1 2019-08-18T08:03:49.380Z https://graph.facebook.com/v3.3/594379764/picture Frankie C False False False False STAR_5 Amazing! So realistic and incredible music 5.0 2h ago 2019-08-18T08:03:49.380Z /user/id/871398953 Frankie C RT 871398953
2 2019-08-18T07:13:32.422Z None jaycee False False False False STAR_5 Classic. Good remake. Loved it. Glover was out... 5.0 3h ago 2019-08-18T07:13:32.422Z None jaycee Fandango DD2453B0-37CE-4B47-A099-D15378FC310E
3 2019-08-18T07:06:08.698Z https://graph.facebook.com/v3.3/10000047937306... Peter A False False False False STAR_2 Nice animation/CGI but completely lacking the ... 2.0 3h ago 2019-08-18T07:06:46.610Z /user/id/906750266 Peter A RT 906750266
4 2019-08-18T06:38:46.892Z None Kevin M False False False True STAR_4 Good but go mainly for the sentimental value 4.0 3h ago 2019-08-18T06:38:46.892Z None Kevin M Fandango 21FE7D93-351C-43B5-833D-B1DB11F3FD6A

Adding target variable to the data

In [18]:
#### Rating score >3 is considered positive (0) and negative (1) otherwise
In [19]:
movie_df['targetSentiment']=[0 if x>3 else 1 for x in movie_df['score']]
In [20]:
pd.DataFrame(movie_df,columns=['targetSentiment','score']).head()
Out[20]:
targetSentiment score
0 0 5.0
1 0 5.0
2 0 5.0
3 1 2.0
4 0 4.0

EDA

In [21]:
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print(movie_df.info())
print("---+++----+++------+++----+++---SHAPE---+++----+++------+++----+++------+++----+++---+++")
print(movie_df.shape)
print("---+++----+++------+++----+++---COLUMNS---+++----+++------+++----+++----+++------+++----+++")
print(movie_df.columns)
---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 17 columns):
createDate          3000 non-null object
displayImageUrl     222 non-null object
displayName         2862 non-null object
hasProfanity        3000 non-null bool
hasSpoilers         3000 non-null bool
isSuperReviewer     3000 non-null bool
isVerified          3000 non-null bool
rating              3000 non-null object
review              3000 non-null object
score               3000 non-null float64
timeFromCreation    3000 non-null object
updateDate          3000 non-null object
user.accountLink    610 non-null object
user.displayName    2862 non-null object
user.realm          3000 non-null object
user.userId         3000 non-null object
targetSentiment     3000 non-null int64
dtypes: bool(4), float64(1), int64(1), object(11)
memory usage: 316.5+ KB
None
---+++----+++------+++----+++---SHAPE---+++----+++------+++----+++------+++----+++---+++
(3000, 17)
---+++----+++------+++----+++---COLUMNS---+++----+++------+++----+++----+++------+++----+++
Index(['createDate', 'displayImageUrl', 'displayName', 'hasProfanity',
       'hasSpoilers', 'isSuperReviewer', 'isVerified', 'rating', 'review',
       'score', 'timeFromCreation', 'updateDate', 'user.accountLink',
       'user.displayName', 'user.realm', 'user.userId', 'targetSentiment'],
      dtype='object')
In [42]:
movie_df['createDate']=pd.to_datetime(movie_df['createDate'])
movie_df['updateDate']=pd.to_datetime(movie_df['updateDate'])
In [22]:
movie_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 17 columns):
createDate          3000 non-null object
displayImageUrl     222 non-null object
displayName         2862 non-null object
hasProfanity        3000 non-null bool
hasSpoilers         3000 non-null bool
isSuperReviewer     3000 non-null bool
isVerified          3000 non-null bool
rating              3000 non-null object
review              3000 non-null object
score               3000 non-null float64
timeFromCreation    3000 non-null object
updateDate          3000 non-null object
user.accountLink    610 non-null object
user.displayName    2862 non-null object
user.realm          3000 non-null object
user.userId         3000 non-null object
targetSentiment     3000 non-null int64
dtypes: bool(4), float64(1), int64(1), object(11)
memory usage: 316.5+ KB
In [23]:
movie_df.isnull().sum()
Out[23]:
createDate             0
displayImageUrl     2778
displayName          138
hasProfanity           0
hasSpoilers            0
isSuperReviewer        0
isVerified             0
rating                 0
review                 0
score                  0
timeFromCreation       0
updateDate             0
user.accountLink    2390
user.displayName     138
user.realm             0
user.userId            0
targetSentiment        0
dtype: int64
In [24]:
print("---+++----+++------+++----+++---Null Inference---+++----+++------+++----+++------+++----+++---+++")
print("Accounlink/displayName/ImageUrl/AccountLink is having NULL values, which can be expected.")
---+++----+++------+++----+++---Null Inference---+++----+++------+++----+++------+++----+++---+++
Accounlink/displayName/ImageUrl/AccountLink is having NULL values, which can be expected.
Target sentiment class division
In [33]:
HAPPY=0
SAD=1

movie_df.groupby('targetSentiment').score.count().iplot(
    kind='bar',
    barmode='group', 
    title='Sentiment - Positive vs Negative ',
    linecolor='black',
    xTitle='Sentiment',
    yTitle='# Reviewers'
    )

pos_users=movie_df.groupby('targetSentiment').score.count()[0]
neg_users=movie_df.groupby('targetSentiment').score.count()[1]
pos_neg_split=np.round((neg_users/(neg_users+pos_users)),4)*100
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("% Of people who did not like this movie",pos_neg_split)
---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++
% Of people who did not like this movie 27.6
Inference:
Class imbalance seen for target variable!
So precautions needs to be taken to handle the train data for modelling

Rating scores distribution
In [38]:
movie_df.score.value_counts().iplot(
    kind='bar',
    xTitle='rating',
    linecolor='black',
    yTitle='# Reviewers',
    title="Reviewer's Rating Distribution")
Inference:
Majority of the reviewers have rated it : 5
And, ratings suggest only 28% people have dis-liked the movie

Ratings given by Verified Users and Super Users
Verified Users: Users who bought tickets
Super Users : Users who have been regular and have thus attained special class
In [39]:
verifiedr_ratings = movie_df[movie_df['isVerified']==1]['targetSentiment'].value_counts()
not_verifiedr_ratings = movie_df[movie_df['isVerified']==0]['targetSentiment'].value_counts()

superr_ratings = movie_df[movie_df['isSuperReviewer']==1]['targetSentiment'].value_counts()
not_superr_ratings = movie_df[movie_df['isSuperReviewer']==0]['targetSentiment'].value_counts()

df1 = pd.DataFrame([verifiedr_ratings,not_verifiedr_ratings])
df2 = pd.DataFrame([superr_ratings,not_superr_ratings])

df1.index = ['Verified User','not Verified User']
df2.index = ['Super User','not Super User']

df1.iplot(kind='bar',barmode='stack',title='Ratings by Verified User Type')
df2.iplot(kind='bar',barmode='stack',title='Ratings by Super User Type')
Inference:
% Verified Users who dis-liked the movie: 24.7

And, only 1 Super reviewer saw the movie
% Not-Super users who dis-liked the movie: 27.5
Ratings(type) Trend across the timeline(days)
In [44]:
movie_trend=movie_df.copy(deep=True)
movie_trend.index = movie_trend['updateDate']
In [45]:
movie_trend.resample('D').mean().score.iplot(
    kind='bar',
    bins=50,
    xTitle='Date',
    linecolor='black',
    yTitle='mean(Rating)',
    title='Daywise Rating Distribution')
Inference:
Average rating across the days is : 4
So the movie on an average can be considered going good with the audience
Ratings(#) Trend across the timeline(days)
In [46]:
movie_trend.resample('D').count().score.iplot(
    kind='bar',
    bins=50,
    xTitle='Date',
    linecolor='black',
    yTitle='# Ratings',
    title='Rating Traffic Trend')
Inference:
People see movies over weekend and more traffic on Sunday being an universal off. Aug 3 and 4 are weekend. Hence the no. of reviews are high on Monday i.e. Aug 5
Similar trend was seen on next weekend, with ratings steadily rising near weekend and reaching local maximum on Monday, as the next day feed from users

Analysing Profanity and Spoiler and its relation with rating

In [54]:
print(movie_df.hasSpoilers.value_counts())
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("\n% Of Users using Spoilers",round(movie_df.hasSpoilers.value_counts()[1]/(movie_df.hasSpoilers.value_counts()[0] +\
                                               movie_df.hasSpoilers.value_counts()[1]),4))
False    2998
True        2
Name: hasSpoilers, dtype: int64
---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++

% Of Users using Spoilers 0.0007
In [53]:
print(movie_df.hasProfanity.value_counts())
print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("\n% Of Users using Profanity",round(movie_df.hasProfanity.value_counts()[1]/(movie_df.hasProfanity.value_counts()[0] +\
                                               movie_df.hasProfanity.value_counts()[1]),4))
False    2968
True       32
Name: hasProfanity, dtype: int64
---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++

% Of Users using Profanity 0.0107

Profanity relation with Ratings
Profanity trend across Timeline(days)
In [49]:
movie_trend[movie_trend['hasProfanity']==True].score.sort_index(axis=0).iplot(
    kind='bar',
    xTitle='Date',
    linecolor='black',
    yTitle='Ratings',
    title='Profanity vs Rating')
Inference:
Profanity has been used throught out the date range.There is no correlation with specific dates
All the reviews have been covered with Profanity. Its not inclined for specific rating.No 1:1 mapping
Analysis of profanity text will help in understanding the terms to keep for sentiment analysis. But here, percentage of users using profanity is very less.
Profanity traffic across Ratings
In [50]:
movie_trend[movie_trend['hasProfanity']==True].score.sort_index(axis=0).value_counts().iplot(
    kind='bar',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    title='#Rating Distribution For Profanity reviews')
Inference:
People can use bad/swear words showing displeasure as well as excitement
In [52]:
perc_prof_reviews=round(movie_df.hasProfanity.value_counts()[1]/(movie_df.hasProfanity.value_counts()[0] +\
                                               movie_df.hasProfanity.value_counts()[1]),4)

perc_prof_reviews=round((movie_df.hasProfanity.value_counts()[1]/movie_df['user.userId'].count())*100,2)

print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("% of Users using profanity :"+ str(perc_prof_reviews)+"\nThis is very less. Can be ignored.")
---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++
% of Users using profanity :1.07
This is very less. Can be ignored.

EDA for text

Making copy of data df and dropping unnecessary columns
In [55]:
total_data=movie_df.copy(deep=True)
In [56]:
total_data.columns.values
Out[56]:
array(['createDate', 'displayImageUrl', 'displayName', 'hasProfanity',
       'hasSpoilers', 'isSuperReviewer', 'isVerified', 'rating', 'review',
       'score', 'timeFromCreation', 'updateDate', 'user.accountLink',
       'user.displayName', 'user.realm', 'user.userId', 'targetSentiment'],
      dtype=object)
In [57]:
columns_except_review_score=['createDate', 'displayImageUrl', 'displayName', 'hasProfanity',
       'hasSpoilers', 'isSuperReviewer', 'isVerified', 'rating','timeFromCreation', 'updateDate', 'user.accountLink',
       'user.displayName', 'user.realm', 'user.userId']
In [58]:
total_data.drop(columns_except_review_score,inplace=True,axis=1)
In [59]:
total_data.head()
Out[59]:
review score targetSentiment
0 I liked most that the animation made the anima... 5.0 0
1 Amazing! So realistic and incredible music 5.0 0
2 Classic. Good remake. Loved it. Glover was out... 5.0 0
3 Nice animation/CGI but completely lacking the ... 2.0 1
4 Good but go mainly for the sentimental value 4.0 0

Plotting graphs BEFORE any cleaning of text

Word Count trend of Reviews
In [60]:
total_data['word_count'] = total_data['review'].apply(lambda x: len(str(x).split()))
In [237]:
total_data['word_count'].iplot(
    kind='violin',
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Word count distribution before cleaning text',colors='#604d9e')
Inference:
Median word count usage shows : 15
Review length of Reviews
In [236]:
total_data['review_len'] = total_data['review'].astype(str).apply(len)
total_data['review_len'].iplot(
    kind='violin',
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Review Length Distribution before cleaning text')
Inference:
Median Review length usage shows : 84
In [63]:
print("---+++----+++------+++----+++---Avg(ReviewLen)/Avg(#Words)---+++----+++------+++----+++------+++----+++---+++")

print("\nReview Length is greater than #words in it by factor of:",\
      round((total_data['review_len'].sum()/total_data['word_count'].sum()),2))
---+++----+++------+++----+++---Avg(ReviewLen)/Avg(#Words)---+++----+++------+++----+++------+++----+++---+++

Review Length is greater than #words in it by factor of: 5.52
Analysis of extra length reviews/having max words
In [64]:
print("---+++----+++------+++----+++---#Word>700---+++----+++------+++----+++------+++----+++---+++")
print(total_data[total_data['word_count']>700])
print("---+++----+++------+++----+++---ReviewLength>4000---+++----+++------+++----+++------+++----+++---+++")
print(total_data[total_data['review_len']>4000])
---+++----+++------+++----+++---#Word>700---+++----+++------+++----+++------+++----+++---+++
                                                 review  score  \
1854  If you could think which Disney classic seems ...    4.5   
1953  Man... where do I start with this one... I cam...    1.5   

      targetSentiment  word_count  review_len  
1854                0         717        4627  
1953                1         776        4246  
---+++----+++------+++----+++---ReviewLength>4000---+++----+++------+++----+++------+++----+++---+++
                                                 review  score  \
1854  If you could think which Disney classic seems ...    4.5   
1953  Man... where do I start with this one... I cam...    1.5   

      targetSentiment  word_count  review_len  
1854                0         717        4627  
1953                1         776        4246  
In [65]:
print(movie_df.iloc[1854])
print("---+++----+++------+++----+++---INFERENCE---+++----+++------+++----+++------+++----+++---+++")
print("\nHe liked the movie, so wrote lengthy summary")
createDate                           2019-08-02 17:04:56.101000+00:00
displayImageUrl     https://graph.facebook.com/v3.3/10000151338074...
displayName                                                    Kyle M
hasProfanity                                                    False
hasSpoilers                                                     False
isSuperReviewer                                                 False
isVerified                                                      False
rating                                                       STAR_4_5
review              If you could think which Disney classic seems ...
score                                                             4.5
timeFromCreation                                         Aug 02, 2019
updateDate                           2019-08-02 17:04:56.101000+00:00
user.accountLink                                   /user/id/907803058
user.displayName                                               Kyle M
user.realm                                                         RT
user.userId                                                 907803058
targetSentiment                                                     0
Name: 1854, dtype: object
---+++----+++------+++----+++---INFERENCE---+++----+++------+++----+++------+++----+++---+++

He liked the movie, so wrote lengthy summary
In [66]:
print(movie_df.iloc[1953])
print("---+++----+++------+++----+++---INFERENCE---+++----+++------+++----+++------+++----+++---+++")
print("\nHe dis-liked the movie, so wrote lengthy summary with some spoiler movie events!")
createDate                           2019-08-02 06:19:58.624000+00:00
displayImageUrl     https://graph.facebook.com/v3.3/90889160915874...
displayName                                                   Micah W
hasProfanity                                                    False
hasSpoilers                                                      True
isSuperReviewer                                                 False
isVerified                                                      False
rating                                                       STAR_1_5
review              Man... where do I start with this one... I cam...
score                                                             1.5
timeFromCreation                                         Aug 02, 2019
updateDate                           2019-08-02 06:19:58.624000+00:00
user.accountLink                                   /user/id/971893883
user.displayName                                              Micah W
user.realm                                                         RT
user.userId                                                 971893883
targetSentiment                                                     1
Name: 1953, dtype: object
---+++----+++------+++----+++---INFERENCE---+++----+++------+++----+++------+++----+++---+++

He dis-liked the movie, so wrote lengthy summary with some spoiler movie events!
N-Grams analysis

Top Unigram words before stem/lemma

In [67]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

def get_top_n_unigram(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_unigram(total_data['review'], 45)

df_uni = pd.DataFrame(common_words, columns = ['unigrams' , 'count'])
df_uni.groupby('unigrams').sum()['count'].\
    sort_values(ascending=False).iplot(kind='bar',yTitle='Count', linecolor='black',\
                                       title='Top 45 Unigrams in review before cleaning')
In [68]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(total_data['review'], 45)

df_bi = pd.DataFrame(common_words, columns = ['bigrams' , 'count'])

df_bi.groupby('bigrams').sum()['count'].\
    sort_values(ascending=False).iplot(kind='bar',yTitle='Count', linecolor='black',\
                                       title='Top 45 Bigrams in review before cleaning')
In [69]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_trigram(total_data['review'], 45)

df_tri = pd.DataFrame(common_words, columns = ['trigrams' , 'count'])


df_tri.groupby('trigrams').sum()['count'].\
    sort_values(ascending=False).iplot(kind='bar',yTitle='Count', linecolor='black',\
                                      title='Top 45 Trigrams in review before cleaning')
Inference:
N-grams is getting biased with presence of stop-words.
e.g. Top n-grams in 3 graphs shows : the (uni) --> the original (bi) --> to the original (tri)
Since the majority of reviews is positive sentiment, top-45 n-grams in all category shows positive words
### Plotting graph **AFTER** cleaning of text

Cleaning the review text

In [71]:
#print(list(STOP_WORDS))

from itertools import compress
from contractions import CONTRACTION_MAP

import unicodedata

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        #print(contraction)
        #print(type(contraction))
        match = contraction.group(0)
        #print(match)
        first_char = match[0]
        #print(first_char)
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    #print(f"The expand match is {expand_match} and text is {text}")
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

def normalize_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    #https://docs.python.org/2/library/unicodedata.html
    return text

def add_NOT_after_not(sentence=None):
    transformed = re.sub(r'\b(?:not|never|no)\b[\w\s]+[^\w\s]', 
       lambda match: re.sub(r'(\s+)(\w+)', r'\1NOT_\2', match.group(0)), 
       sentence,
       flags=re.IGNORECASE)
    
    return transformed

def tokenize_lemma_clean(dataframe,column_name):
    # text cleaning and pre-processing
   
    for index,row in dataframe.iterrows():
        #print(row[column_name],":::",row['score'])
    
        row[column_name]=row[column_name].lower()

        doc=nlp(row[column_name])

        # remove stop words
        clean_tokens1 = [token for token in doc if not token.is_stop]
        
        # may use STOP words for conjuction detection
        #remove words with <=2 chars
        clean_tokens1 = [token for token in clean_tokens1 if len(token.text)>2]

        #remove non-alpha
        clean_tokens2_bool = [token.is_alpha for token in clean_tokens1]
        #clean_tokens2_bool = [token.is_alpha for token in doc]
        clean_tokens2=list(compress(clean_tokens1,clean_tokens2_bool))

        #not_string = re.compile("not_*{7,}")
        #use only lemma
        clean_tokens3 = [token.lemma_ for token in clean_tokens2]

        #print(clean_tokens3)

        clean_text=' '.join(clean_tokens3)

        #dataframe.at[index,column_name]=clean_text
        dataframe.at[index,"clean_text1"]=clean_text

        
        
def pos_to_keep(dataframe,column_name,dest_column_name,x1=None,x2=None,x3=None,x4=None):
    # text cleaning and pre-processing
    for index,row in dataframe.iterrows():
        #print(row['STORY'],":::",row['SECTION'])

        doc=nlp(row[column_name])

        # remove stop words
        nn_tokens1 = [token.text for token in doc if ((token.text in ['like','love'])or(token.pos_ in [x1,x2,x3,x4]))]



        nn_text=' '.join(nn_tokens1)

        dataframe.at[index,dest_column_name]=nn_text
In [72]:
## keeping cleaned data in 'clean_text' column for safekeeping and comparing with original
In [73]:
total_data.insert(2,"clean_text","")

Spelling correction

In [ ]:
#total_data['clean_text'] = [TextBlob(text).correct() for text in total_data['review']]

Expand contractions

In [74]:
total_data['clean_text'] = [expand_contractions(text) for text in total_data['review']]
#total_data['clean_text'] = [expand_contractions(text) for text in total_data['clean_text']]

Normalize accented characters

In [75]:
total_data['clean_text'] = [normalize_accented_chars(text) for text in total_data['clean_text']]

Add NOT_ after not is encountered till punctuation

In [76]:
total_data['clean_text'] = [add_NOT_after_not(text) for text in total_data['clean_text']]

Lemmatization

In [79]:
## keeping the second stage of clean data in other column "clean_text1" for safekeeping and comparing with earlier clean data
## 'clean_text1' will have the final cleaned data after all these above process and lemmatization
In [78]:
tokenize_lemma_clean(total_data,"clean_text")
In [80]:
total_data.head(20)
Out[80]:
review score clean_text targetSentiment word_count review_len clean_text1
0 I liked most that the animation made the anima... 5.0 I liked most that the animation made the anima... 0 12 61 like animation animal look real
1 Amazing! So realistic and incredible music 5.0 Amazing! So realistic and incredible music 0 6 42 amazing realistic incredible music
2 Classic. Good remake. Loved it. Glover was out... 5.0 Classic. Good remake. Loved it. Glover was out... 0 8 54 classic good remake love glover outstanding
3 Nice animation/CGI but completely lacking the ... 2.0 Nice animation/CGI but completely lacking the ... 1 26 177 nice animation cgi completely lack disney styl...
4 Good but go mainly for the sentimental value 4.0 Good but go mainly for the sentimental value 0 8 44 good mainly sentimental value
5 The animation was really good 4.0 The animation was really good 0 5 29 animation good
6 i thought it was awesone. i really like the or... 5.0 i thought it was awesone. i really like the or... 0 33 177 think awesone like originalaty enjoy nail wait...
7 I think that they should’ve brought back more ... 3.0 I think that they shouldve brought back more o... 1 76 393 think bring original voice actor movie iconic ...
8 Great, absolutely great! The animation was gr... 5.0 Great, absolutely great! The animation was gr... 0 31 190 great absolutely great animation great story l...
9 It was just like the first one. Very good anim... 5.0 It was just like the first one. Very good anim... 0 13 68 like good animation cute movie
10 Didn't make me feel anything unlike the original. 3.0 Did not NOT_make NOT_me NOT_feel NOT_anything ... 1 8 49
11 Well done! A great way to redo a classic. It f... 5.0 Well done! A great way to redo a classic. It f... 0 26 146 great way redo classic feel national geographi...
12 Good movie. Just that the original animated w... 4.0 Good movie. Just that the original animated w... 0 13 78 good movie original animate well humble opinion
13 I didn’t like Beyoncé being nala. Bad acting. ... 3.0 I didnt like Beyonce being nala. Bad acting. I... 1 60 308 like beyonce nala bad act like keep thing like...
14 Such a beautiful story. We've loved the animat... 5.0 Such a beautiful story. We have loved the anim... 0 30 170 beautiful story love animate version year step...
15 great Flick, Kids had some good Laughs,. 4.0 great Flick, Kids had some good Laughs,. 0 7 40 great flick kid good laugh
16 It was great!! We loved it. 5.0 It was great!! We loved it. 0 6 27 great love
17 It’s The Lion King! This movie is absolutely g... 4.5 Its The Lion King! This movie is absolutely go... 0 43 223 lion king movie absolutely gorgeous little det...
18 I truly enjoyed this movie from the story to t... 5.0 I truly enjoyed this movie from the story to t... 0 16 77 truly enjoy movie story music great cast
19 All of the animals were so life-like and real!... 5.0 All of the animals were so life-like and real!... 0 19 100 animal life like real know wonderful love

Plotting graphs

Word Count trend of Reviews
In [238]:
total_data['word_count_clean'] = total_data['clean_text1'].apply(lambda x: len(str(x).split()))

total_data['word_count_clean'].iplot(
    kind='violin',
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Word count Distribution after cleaning text',colors='#604d9e')
Inference:
Median word count=6
Upper wishker = 24
so even if we take 3*whisker, so taking ~40 words we will cover 99.7% dataset
Review length of Reviews
In [234]:
total_data['review_len'] = total_data['clean_text1'].astype(str).apply(len)
total_data['review_len'].iplot(
    kind='violin',
    xTitle='review length',
    linecolor='black',
    yTitle='count',
    title='Review Length Distribution after cleaning text')
Inference:
Median Review length usage shows : 42
i.e. drops to almost half from original un-cleaned review length

Top Unigram words after text cleaning

In [82]:
common_words = get_top_n_unigram(total_data['clean_text1'], 45)
#for word, freq in common_words:
#    print(word, freq)
df_uni_clean = pd.DataFrame(common_words, columns = ['unigrams' , 'count'])
df_uni_clean.groupby('unigrams').sum()['count'].\
    sort_values(ascending=False).iplot(kind='bar',yTitle='Count', linecolor='black',\
                                       title='Top 45 Unigrams in review after text cleaning')

Top Bigram words text cleaning

In [83]:
common_words = get_top_n_bigram(total_data['clean_text1'], 45)
#for word, freq in common_words:
#    print(word, freq)
df_bi_clean = pd.DataFrame(common_words, columns = ['bigrams' , 'count'])

df_bi_clean.groupby('bigrams').sum()['count'].\
    sort_values(ascending=False).iplot(kind='bar',yTitle='Count', linecolor='black',\
                                       title='Top 45 Bigrams in review after text cleaning')

Top Trigram words after text cleaning

In [84]:
common_words = get_top_n_trigram(total_data['clean_text1'], 45)
#for word, freq in common_words:
#    print(word, freq)
df_tri_clean = pd.DataFrame(common_words, columns = ['trigrams' , 'count'])


df_tri_clean.groupby('trigrams').sum()['count'].\
    sort_values(ascending=False).iplot(kind='bar',yTitle='Count', linecolor='black',\
                                      title='Top 45 Trigrams in review after text cleaning')
Inference:
More meaningful words are displated now
Even tigrams now give a good sense of topic being discussed in reviews
Since most of the reviews are with positive sentiment, hence more positive topics seems to be discussed
Word Cloud representation of cleaned review text
In [85]:
all_words = ' '.join([text for text in total_data['clean_text1']])


lion_mask = np.array(Image.open("lion.jpg"))
wordcloud = WordCloud(background_color="white",width=5000, height=5000, random_state=21,\
                      max_font_size=110,mask=lion_mask,contour_width=3, contour_color='steelblue').generate(all_words)
plt.figure(figsize=(100, 70))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
Word Cloud representation of cleaned review text but for only Negative Reviews
In [86]:
negative_sent_text=total_data[total_data['targetSentiment']==1]['clean_text1']

all_words = ' '.join([text for text in negative_sent_text])

lion_mask = np.array(Image.open("lion.jpg"))
wordcloud = WordCloud(background_color="white",width=5000, height=5000, random_state=21,\
                      max_font_size=110,mask=lion_mask,contour_width=3, contour_color='orange').generate(all_words)
plt.figure(figsize=(100, 70))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()
Inference:
Nouns are replicated in both the pictures
Adverb,Adjective are the differenciating factors, hence will be a good feature for classification

POS Tags Analysis

Distribution of different POS
In [88]:
#!pip install textblob
from textblob import TextBlob
blob = TextBlob(str(total_data['clean_text1'].tolist()))
pos_df = pd.DataFrame(blob.tags, columns = ['word' , 'pos'])

#pos_df = pos_df[~pos_df['word'].str.isnumeric()]
pos_df = pos_df.pos.value_counts()[:20]
pos_df.iplot(
    kind='bar',
    xTitle='POS',
    yTitle='count', 
    title='Top 20 Part-of-speech tagging for review corpus')
Plotting the documents in vector space and classifying on target sentiment
In [241]:
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(total_data['clean_text1'])


tsne = TSNEVisualizer()
tsne.fit_transform(docs, total_data['targetSentiment'])
tsne.poof()
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
Keeping only adjective/adverb/verb in reviews and plotting the documents
In [93]:
pos_to_keep(total_data,"clean_text1","clean_adj_adv_verb_col","ADJ","ADV","VERB")
In [95]:
total_data.head()
Out[95]:
review score clean_text targetSentiment word_count review_len clean_text1 word_count_clean clean_adj_adv_verb_col
0 I liked most that the animation made the anima... 5.0 I liked most that the animation made the anima... 0 12 61 like animation animal look real 5 like look real
1 Amazing! So realistic and incredible music 5.0 Amazing! So realistic and incredible music 0 6 42 amazing realistic incredible music 4 amazing realistic incredible
2 Classic. Good remake. Loved it. Glover was out... 5.0 Classic. Good remake. Loved it. Glover was out... 0 8 54 classic good remake love glover outstanding 6 classic good love outstanding
3 Nice animation/CGI but completely lacking the ... 2.0 Nice animation/CGI but completely lacking the ... 1 26 177 nice animation cgi completely lack disney styl... 17 nice completely lack disney wonder original un...
4 Good but go mainly for the sentimental value 4.0 Good but go mainly for the sentimental value 0 8 44 good mainly sentimental value 4 good mainly sentimental
In [96]:
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(total_data['clean_adj_adv_verb_col'])

tsne = TSNEVisualizer()
tsne_results=tsne.fit_transform(docs, total_data['targetSentiment'])
tsne.poof()
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
Inference:
Documents are much more clearly separated with using Adj/Adv/verb
Separated islands and denser categories seen in comparison to earlier plot having all POS tags
This can be used in feature engineering later.
In [98]:
pos_to_keep(total_data,"clean_text1","clean_adj_col","ADJ")
In [242]:
total_data.head()
Out[242]:
review score clean_text targetSentiment word_count review_len clean_text1 word_count_clean clean_adj_adv_verb_col clean_adj_col clean_nn doc_topic_cluster_group tokenized_text rev_polarity
0 I liked most that the animation made the anima... 5.0 I liked most that the animation made the anima... 0 12 61 like animation animal look real 5 like look real like real like animation animal 0 [like, animation, animal, look, real] 0.200000
1 Amazing! So realistic and incredible music 5.0 Amazing! So realistic and incredible music 0 6 42 amazing realistic incredible music 4 amazing realistic incredible amazing realistic incredible music 0 [amazing, realistic, incredible, music] 0.555556
2 Classic. Good remake. Loved it. Glover was out... 5.0 Classic. Good remake. Loved it. Glover was out... 0 8 54 classic good remake love glover outstanding 6 classic good love outstanding classic good love outstanding remake love glover 2 [classic, good, remake, love, glover, outstand... 0.466667
3 Nice animation/CGI but completely lacking the ... 2.0 Nice animation/CGI but completely lacking the ... 1 26 177 nice animation cgi completely lack disney styl... 17 nice completely lack disney wonder original un... nice disney original undue kiddy real animation cgi style humour concentration viole... 0 [nice, animation, cgi, completely, lack, disne... 0.318750
4 Good but go mainly for the sentimental value 4.0 Good but go mainly for the sentimental value 0 8 44 good mainly sentimental value 4 good mainly sentimental good sentimental value 2 [good, mainly, sentimental, value] 0.225000
In [100]:
tfidf = TfidfVectorizer()
docs = tfidf.fit_transform(total_data['clean_adj_col'])


tsne = TSNEVisualizer()
tsne.fit_transform(docs, total_data['targetSentiment'])
tsne.poof()
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
'c' argument looks like a single numeric RGB or RGBA sequence, which should be avoided as value-mapping will have precedence in case its length matches with 'x' & 'y'.  Please use a 2-D array with a single row if you really want to specify the same RGB or RGBA value for all points.
Inference:
Documents are much more clearly separated with using only Adj
More separation seen and more denser categories seen in comparison to earlier plot
POS tags/text like ADJ can be used in feature engineering later.
Wordcloud of Adjectives in negative reviews
In [256]:
negative_sent_text_adj=total_data[total_data['targetSentiment']==1]['clean_adj_col']

all_words = ' '.join([text for text in negative_sent_text_adj])


lion_mask = np.array(Image.open("lion.jpg"))
wordcloud = WordCloud(background_color="white",width=5000, height=5000, random_state=21,\
                      max_font_size=110,mask=lion_mask,contour_width=3, contour_color='orange').generate(all_words)
plt.figure(figsize=(100, 70))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# love,like - keeping as default as its common and needs to be there
Wordcloud of Adjectives in positive reviews
In [200]:
pos_sent_text_adj=total_data[total_data['targetSentiment']==0]['clean_adj_col']

all_words = ' '.join([text for text in pos_sent_text_adj])

lion_mask = np.array(Image.open("lion.jpg"))
wordcloud = WordCloud(background_color="white",width=5000, height=5000, random_state=21,\
                      max_font_size=110,mask=lion_mask,contour_width=3, contour_color='steelblue').generate(all_words)
plt.figure(figsize=(100, 70))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()

# love,like - keeping as default as its common and needs to be there

Topic modelling

In [114]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=20000,
                                 min_df=0.05,
                                 use_idf=True, ngram_range=(1,4))

tfidf_matrix = tfidf_vectorizer.fit_transform(total_data['clean_text1'])



tfidf_vectorizer.get_feature_names()
n_components = 26
svd_model = TruncatedSVD(n_components=n_components, algorithm='randomized',n_iter=20,random_state=143)

svd_matrix = svd_model.fit(tfidf_matrix)



doc_topic_matrix = svd_matrix.transform(tfidf_matrix)

print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print(f"tfidf_matrix.shape : {tfidf_matrix.shape}")
print(f"svd_matrix.n_components : {svd_matrix.n_components}")

print(f"\n\nExplained Variance Ratio : {svd_matrix.explained_variance_ratio_}") 
print(f"\nTotal Explained Variance : {round(svd_matrix.explained_variance_ratio_.sum() * 100, 2)} %")
print(f"\nThe singular values are {svd_matrix.singular_values_}")
---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++
tfidf_matrix.shape : (3000, 35)
svd_matrix.n_components : 26


Explained Variance Ratio : [0.04555754 0.07951979 0.0723996  0.06199811 0.05077909 0.05058907
 0.04318302 0.04046402 0.03422669 0.0313779  0.03042582 0.03015827
 0.02900911 0.02754599 0.02741932 0.02608363 0.02549402 0.02448314
 0.02372075 0.02228126 0.02171828 0.02015949 0.01962178 0.01920888
 0.01856792 0.01796588]

Total Explained Variance : 89.4 %

The singular values are [20.45277695 14.03356209 13.46627574 12.35958568 11.28544113 11.12588406
 10.35259268  9.93583667  9.14418032  8.76718312  8.62630355  8.57904664
  8.42094408  8.20317367  8.1828065   7.99156614  7.88648055  7.73281409
  7.60716911  7.37346181  7.27950364  7.0159315   6.92060531  6.84567719
  6.73043928  6.62169459]

Picking the most important words in each topic

In [115]:
terms = tfidf_vectorizer.get_feature_names()
for i, comp in enumerate(svd_model.components_):
    #print(f"The component is {comp} and shape is {comp.shape}")
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:6]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0],end=" ")
    print("\n")
Topic 0: 
movie love original great good like 

Topic 1: 
love movie amazing great kid music 

Topic 2: 
movie great enjoy kid good think 

Topic 3: 
great story animation original remake music 

Topic 4: 
good like animal story real animation 

Topic 5: 
like animal amazing feel movie look 

Topic 6: 
amazing lion king lion king animal remake 

Topic 7: 
lion king lion king remake watch version 

Topic 8: 
animal real look animation beautiful enjoy 

Topic 9: 
enjoy kid remake version film amazing 

Topic 10: 
voice remake well version animate character 

Topic 11: 
version animate well story watch kid 

Topic 12: 
animation story music beautiful voice version 

Topic 13: 
well animation music lion version lion king 

Topic 14: 
voice kid character feel beautiful watch 

Topic 15: 
kid watch well remake animation story 

Topic 16: 
well story remake enjoy animal like 

Topic 17: 
beautiful remake version music enjoy like 

Topic 18: 
watch time think film well feel 

Topic 19: 
film feel scene disney see time 

Topic 20: 
music feel film scene animal watch 

Topic 21: 
feel version story animal enjoy voice 

Topic 22: 
think music real look character disney 

Topic 23: 
scene feel character song look version 

Topic 24: 
animal think see time scene voice 

Topic 25: 
see time look real well song 

Calculate optimal cluster of topics

In [118]:
Sum_of_squared_distances = []
K = range(1,8)
for k in K:
    kmeanModel = KMeans(n_clusters=k, random_state=143)
    kmeanModel.fit(doc_topic_matrix)
    Sum_of_squared_distances.append(kmeanModel.inertia_)
In [119]:
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 10}

plt.rc('font', **font)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
C:\Users\trex\Anaconda3\lib\site-packages\matplotlib\font_manager.py:1241: UserWarning:

findfont: Font family ['normal'] not found. Falling back to DejaVu Sans.

Inference:
Since target class is imbalanced and mostly people are talking positive. Hence the higher freq of words are positive in most of the topics
So the elbow curve also shows a smooth curve for elbow point
In [120]:
# Choosing the best possible elbow point
num_clusters = 4

km = KMeans(n_clusters=num_clusters)

km.fit(doc_topic_matrix)

clusters = km.labels_.tolist()

centers = km.cluster_centers_
print(f"the cluster centers are {centers}")

joblib.dump(km,  'doc_topic_cluster_best_K.pkl')
the cluster centers are [[ 2.15323864e-01 -1.19696331e-01 -1.07890292e-01  4.15721758e-02
   1.62396135e-02  8.24711565e-02  7.12831791e-02  2.10258369e-02
   1.50983564e-02  1.53301381e-02  1.87371879e-02  4.95222435e-03
   1.42219491e-02  7.82577819e-03  5.91540883e-03  1.49054200e-02
   6.71194563e-05  5.03448106e-03 -1.85355454e-03  5.74852098e-03
   9.92546552e-05 -5.82912044e-03  6.52495405e-03  3.69939797e-03
  -1.67172600e-03  3.41852460e-03]
 [ 5.09507455e-01  9.07959866e-03  3.14517210e-01  1.29163205e-01
  -8.88119229e-02  1.46562752e-02 -1.62612633e-02 -5.00605343e-03
  -5.36529787e-03  8.96694329e-04 -7.15441089e-03  5.79065505e-03
  -7.14219526e-03  3.82423735e-03  3.97010377e-03  1.00040793e-03
  -4.49713405e-03  8.15673126e-03 -6.86920805e-04 -5.43730255e-03
   7.81792936e-03  5.87558455e-04 -6.34017112e-03 -6.33583915e-03
  -1.83467563e-03  3.23102527e-03]
 [ 3.77658612e-01 -2.24028017e-01  1.25862862e-03 -1.89756445e-01
   3.19251397e-01 -2.73608779e-01 -6.36225572e-02 -4.14677692e-02
  -1.90776462e-02  1.70686764e-04 -1.73348565e-02  7.60379944e-04
  -7.58848088e-03  9.69772540e-03 -9.98150404e-04 -3.55221871e-03
   8.12901720e-03  6.86286756e-03  1.70633516e-03 -2.62607645e-03
  -7.09718340e-03  2.00320683e-03 -1.30753810e-04 -1.79065210e-03
   4.56542588e-03 -1.24405336e-03]
 [ 4.46131615e-01  4.45729903e-01 -2.58704021e-01 -3.35535755e-03
   1.01293161e-02 -2.75150193e-02 -3.17343984e-02 -3.20392047e-02
   8.08980006e-03  1.07121421e-02  6.02451246e-03 -6.86670754e-03
   8.25747056e-03 -5.42624674e-03  5.83391619e-03  5.17044228e-03
   3.39523635e-03 -2.74910997e-03  6.56112740e-03 -2.06117650e-04
   7.16136598e-03 -6.14377144e-03  3.06155996e-03 -2.01798445e-04
   8.32397173e-03  5.12814266e-04]]
Out[120]:
['doc_topic_cluster_best_K.pkl']
In [121]:
clusters = km.labels_.tolist()
total_data['doc_topic_cluster_group'] = clusters
In [244]:
total_data.head()
Out[244]:
review score clean_text targetSentiment word_count review_len clean_text1 word_count_clean clean_adj_adv_verb_col clean_adj_col clean_nn doc_topic_cluster_group tokenized_text rev_polarity
0 I liked most that the animation made the anima... 5.0 I liked most that the animation made the anima... 0 12 61 like animation animal look real 5 like look real like real like animation animal 0 [like, animation, animal, look, real] 0.200000
1 Amazing! So realistic and incredible music 5.0 Amazing! So realistic and incredible music 0 6 42 amazing realistic incredible music 4 amazing realistic incredible amazing realistic incredible music 0 [amazing, realistic, incredible, music] 0.555556
2 Classic. Good remake. Loved it. Glover was out... 5.0 Classic. Good remake. Loved it. Glover was out... 0 8 54 classic good remake love glover outstanding 6 classic good love outstanding classic good love outstanding remake love glover 2 [classic, good, remake, love, glover, outstand... 0.466667
3 Nice animation/CGI but completely lacking the ... 2.0 Nice animation/CGI but completely lacking the ... 1 26 177 nice animation cgi completely lack disney styl... 17 nice completely lack disney wonder original un... nice disney original undue kiddy real animation cgi style humour concentration viole... 0 [nice, animation, cgi, completely, lack, disne... 0.318750
4 Good but go mainly for the sentimental value 4.0 Good but go mainly for the sentimental value 0 8 44 good mainly sentimental value 4 good mainly sentimental good sentimental value 2 [good, mainly, sentimental, value] 0.225000
In [128]:
doc_cluster_df = pd.DataFrame(total_data)

print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("No. of docs in each cluster id")
doc_cluster_df['doc_topic_cluster_group'].value_counts()
---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++
No. of docs in each cluster id
Out[128]:
0    1677
1     537
3     415
2     371
Name: doc_topic_cluster_group, dtype: int64

Fetching the most frequent words among each cluster

In [131]:
from itertools import chain
doc_cluster_df['tokenized_text'] = [text.split(' ') for text in doc_cluster_df['clean_text1']]
grouped_text = doc_cluster_df.groupby('doc_topic_cluster_group')['tokenized_text']
frequent_words_df = pd.DataFrame(columns={"values", "counts", "cluster_id"})


for num in range(num_clusters):
    values, counts = np.unique(list(chain.from_iterable(grouped_text.get_group(num))), return_counts=True)
    sorted_indices = np.argsort(-counts)
    frequent_words_df = frequent_words_df.append({"values":values[sorted_indices], "counts":counts[sorted_indices], "cluster_id": num}, ignore_index=True)
In [132]:
frequent_words_df.head()
Out[132]:
counts cluster_id values
0 [682, 497, 485, 269, 246, 246, 242, 234, 227, ... 0 [original, movie, like, lion, king, animal, re...
1 [581, 369, 95, 74, 47, 44, 43, 41, 40, 31, 29,... 1 [movie, great, original, love, like, enjoy, fa...
2 [418, 226, 129, 62, 42, 40, 38, 35, 33, 33, 32... 2 [good, movie, original, like, think, remake, v...
3 [487, 168, 69, 40, 39, 39, 30, 28, 27, 23, 23,... 3 [love, movie, original, amazing, great, animal...
In [150]:
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 35}

plt.rc('font', **font)

fig = plt.figure(figsize=(20,50))
plt.subplot(2,2,1)
plt.xlabel("Topic 0",fontsize=24,color='steelblue',fontfamily='serif',weight='bold')
plt.barh(frequent_words_df.loc[0,'values'][:20], frequent_words_df.loc[0,'counts'][:20])
plt.gca().invert_yaxis()


plt.subplot(2,2,2)
plt.xlabel("Topic 1",fontsize=24,color='steelblue',fontfamily='serif',weight='bold')
plt.barh(frequent_words_df.loc[1,'values'][:20], frequent_words_df.loc[1,'counts'][:20])
plt.gca().invert_yaxis()

plt.subplot(2,2,3)
plt.xlabel("Topic 2",fontsize=24,color='steelblue',fontfamily='serif',weight='bold')
plt.barh(frequent_words_df.loc[2,'values'][:20], frequent_words_df.loc[2,'counts'][:20])
plt.gca().invert_yaxis()


plt.subplot(2,2,4)
plt.xlabel("Topic 3",fontsize=24,color='steelblue',fontfamily='serif',weight='bold')
plt.barh(frequent_words_df.loc[3,'values'][:20], frequent_words_df.loc[3,'counts'][:20])
plt.gca().invert_yaxis()
Inference:
Since target class is imbalanced and mostly people are talking positive. Hence, the higher freq of words are positive in most of the topics
Thus topic are revolving around all these positive words
e.g.
Topic 0 : Loved the remake of Original Lion King, animal animation and voice over is amazing
Topic 1 : Family movie, visual and graphics loved by kids

Topic modelling for Negative Reviews

In [179]:
#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=20000,
                                 min_df=0.05,
                                 use_idf=True, ngram_range=(1,4))

neg_senti_df=total_data[total_data['targetSentiment']==1].copy(deep=True)

tfidf_matrix = tfidf_vectorizer.fit_transform(neg_senti_df['clean_text1'])



tfidf_vectorizer.get_feature_names()
n_components = 26
svd_model = TruncatedSVD(n_components=n_components, algorithm='randomized',n_iter=20,random_state=143)

svd_matrix = svd_model.fit(tfidf_matrix)



doc_topic_matrix = svd_matrix.transform(tfidf_matrix)

print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print(f"tfidf_matrix.shape : {tfidf_matrix.shape}")
print(f"svd_matrix.n_components : {svd_matrix.n_components}")

print(f"\n\nExplained Variance Ratio : {svd_matrix.explained_variance_ratio_}") 
print(f"\nTotal Explained Variance : {round(svd_matrix.explained_variance_ratio_.sum() * 100, 2)} %")
print(f"\nThe singular values are {svd_matrix.singular_values_}")
---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++
tfidf_matrix.shape : (828, 44)
svd_matrix.n_components : 26


Explained Variance Ratio : [0.0436999  0.07429024 0.05765958 0.0420238  0.04190451 0.03936173
 0.03878174 0.03306398 0.03198087 0.02958898 0.02796077 0.02704614
 0.02641612 0.02493091 0.02481707 0.02366273 0.02286117 0.02209021
 0.02184397 0.02143101 0.0208044  0.02039033 0.01885433 0.01861359
 0.01738243 0.01713736]

Total Explained Variance : 78.86 %

The singular values are [10.5057318   6.97660396  6.20895615  5.26835308  5.22332478  5.06585021
  5.02867991  4.64456289  4.5653403   4.38921494  4.26672126  4.19632722
  4.14759715  4.03113484  4.02045236  3.9252676   3.85827738  3.79330738
  3.77239683  3.7366577   3.68050724  3.64408793  3.50544929  3.48436755
  3.36445383  3.34032897]
In [183]:
terms = tfidf_vectorizer.get_feature_names()
for i, comp in enumerate(svd_model.components_):
    #print(f"The component is {comp} and shape is {comp.shape}")
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:6]
    print("Topic "+str(i)+": ")
    for t in sorted_terms:
        print(t[0],end=" ")
    print("\n")
Topic 0: 
movie original like good well voice 

Topic 1: 
movie good bad leave think beyonce 

Topic 2: 
original movie well new way change 

Topic 3: 
good well cartoon version great cgi 

Topic 4: 
like well cartoon good version feel 

Topic 5: 
well cartoon version lion king animate 

Topic 6: 
voice well actor great bad character 

Topic 7: 
remake good voice like cartoon version 

Topic 8: 
remake great like animation character animal 

Topic 9: 
version animate feel lack film watch 

Topic 10: 
feel remake great animation film feel like 

Topic 11: 
film lack bad well remake scene 

Topic 12: 
film watch bad voice look cartoon 

Topic 13: 
bad song cartoon new beyonce change 

Topic 14: 
scene animate bad character think song 

Topic 15: 
bad feel animation animal think real 

Topic 16: 
animal scene song cartoon change voice 

Topic 17: 
story animate new animation song character 

Topic 18: 
watch think song new change story 

Topic 19: 
character cartoon feel way visual lack 

Topic 20: 
watch new scene story lack animation 

Topic 21: 
think version cartoon film scene new 

Topic 22: 
time beyonce character animation story cartoon 

Topic 23: 
beautiful visual way version love story 

Topic 24: 
disney beautiful love animation scene live 

Topic 25: 
animation song leave character live film 

In [184]:
Sum_of_squared_distances = []
K = range(1,8)
for k in K:
    kmeanModel = KMeans(n_clusters=k, random_state=143)
    kmeanModel.fit(doc_topic_matrix)
    Sum_of_squared_distances.append(kmeanModel.inertia_)
    
    
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 10}

plt.rc('font', **font)
plt.plot(K, Sum_of_squared_distances, 'bx-')
plt.xlabel('k')
plt.ylabel('Sum_of_squared_distances')
plt.title('Elbow Method For Optimal k')
plt.show()
In [186]:
# Choosing the best possible elbow point
num_clusters = 5

km = KMeans(n_clusters=num_clusters)

km.fit(doc_topic_matrix)

clusters = km.labels_.tolist()

centers = km.cluster_centers_
print(f"the cluster centers are {centers}")

joblib.dump(km,  'doc_topic_cluster_best_K1.pkl')

clusters = km.labels_.tolist()
neg_senti_df['doc_topic_cluster_group'] = clusters
the cluster centers are [[ 4.18669538e-01 -2.43448655e-02 -2.26343865e-01 -7.74442312e-02
  -2.64387110e-01  1.98655139e-01 -8.59381929e-02  1.19823814e-02
  -1.93866269e-01 -8.74070963e-02 -1.11378629e-01 -6.17237289e-02
  -2.70438695e-02  1.52090892e-02 -8.21504942e-03  3.72347398e-03
  -7.20455090e-02  2.54790246e-02 -3.12028365e-02  1.30068082e-02
  -8.65640997e-03 -1.13466670e-03 -1.48056561e-02  2.55017222e-03
  -1.84488420e-02  4.70969029e-03]
 [ 4.12293596e-01 -3.63455602e-01  2.51063729e-01 -2.20034974e-02
   4.50595235e-03 -7.16807027e-03  4.26562761e-02  5.20096446e-03
  -9.22354159e-03 -2.33331247e-02  1.16021821e-02 -6.25173917e-03
   5.18505144e-04 -8.06567703e-03 -9.54547201e-04 -1.94889990e-03
  -1.17931078e-04 -2.25980452e-03  1.80595161e-03 -3.33992133e-03
  -1.51004084e-02 -1.38181354e-02  1.08092605e-03  1.26874893e-02
   8.05043823e-03 -5.17974071e-03]
 [ 3.85487008e-01  4.03206894e-02 -7.05921400e-02  2.96108731e-01
   1.11658909e-01 -1.74749151e-01 -2.07378115e-01  4.32684462e-02
  -4.23670490e-02 -4.36262766e-02 -1.47892708e-02 -8.88150418e-03
  -8.25176056e-03  3.59779454e-03 -5.41482293e-03  1.73089401e-02
  -2.67355330e-03  2.42370842e-02 -1.34056052e-02 -8.66245732e-04
   5.78295193e-03 -1.81966457e-02 -1.65950392e-02  1.95245758e-03
   3.95579795e-03 -3.07708638e-05]
 [ 1.95104409e-01 -5.07564721e-02 -1.27060190e-01  1.44347177e-02
   1.59867649e-02  1.01972436e-02  4.87931733e-02 -2.87144794e-02
   4.79856308e-02  2.22291838e-02  1.82149220e-02  9.93965248e-03
   5.68014423e-03  1.04195949e-02  7.38419351e-03 -3.71600899e-03
   1.46775252e-02 -1.19400847e-02  1.28616061e-02 -9.06434561e-03
   1.62161993e-03  1.24939090e-02  1.09176493e-02  5.28488897e-03
   3.65899066e-03  2.04260565e-03]
 [ 4.93698118e-01  3.36014394e-01  1.44097864e-01 -6.32979813e-02
   1.16869540e-02  3.11111840e-02  5.52229298e-02  4.05828980e-04
   3.07761623e-02  3.78353382e-02  1.16165833e-02  1.37690910e-02
   1.66463281e-02 -7.57464244e-03  3.71166512e-03  6.99964896e-03
   6.94527816e-03 -1.11539618e-02  6.53807856e-03  3.25632129e-03
   1.94363664e-03  2.07652883e-04  1.01531678e-02  2.09647058e-03
  -3.04803635e-04 -2.46366745e-03]]
In [187]:
doc_cluster_df = pd.DataFrame(neg_senti_df)

print("---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++")
print("No. of docs in each cluster id")
doc_cluster_df['doc_topic_cluster_group'].value_counts()
---+++----+++------+++----+++---INFO---+++----+++------+++----+++------+++----+++---+++
No. of docs in each cluster id
Out[187]:
3    412
4    131
1    113
2     97
0     75
Name: doc_topic_cluster_group, dtype: int64
In [188]:
doc_cluster_df['tokenized_text'] = [text.split(' ') for text in doc_cluster_df['clean_text1']]
grouped_text = doc_cluster_df.groupby('doc_topic_cluster_group')['tokenized_text']
frequent_words_df = pd.DataFrame(columns={"values", "counts", "cluster_id"})


for num in range(num_clusters):
    values, counts = np.unique(list(chain.from_iterable(grouped_text.get_group(num))), return_counts=True)
    sorted_indices = np.argsort(-counts)
    frequent_words_df = frequent_words_df.append({"values":values[sorted_indices], "counts":counts[sorted_indices], "cluster_id": num}, ignore_index=True)
In [189]:
frequent_words_df.head()
Out[189]:
counts cluster_id values
0 [95, 95, 82, 77, 47, 44, 38, 37, 34, 31, 31, 2... 0 [king, lion, movie, original, remake, feel, fi...
1 [143, 44, 23, 17, 14, 10, 9, 9, 9, 8, 8, 7, 7,... 1 [original, well, like, feel, movie, lack, chan...
2 [114, 67, 44, 25, 23, 20, 18, 18, 17, 17, 17, ... 2 [good, movie, original, like, lack, voice, cha...
3 [133, 99, 88, 81, 75, 72, 59, 57, 56, 55, 54, ... 3 [like, voice, original, movie, feel, film, cha...
4 [203, 57, 25, 19, 17, 16, 15, 14, 13, 13, 11, ... 4 [movie, original, remake, like, animate, watch...
In [192]:
font = {'family' : 'normal',
        'weight' : 'bold',
        'size'   : 35}

plt.rc('font', **font)

fig = plt.figure(figsize=(20,50))
plt.subplot(3,2,1)
plt.xlabel("Topic 0",fontsize=24,color='steelblue',fontfamily='serif',weight='bold')
plt.barh(frequent_words_df.loc[0,'values'][:20], frequent_words_df.loc[0,'counts'][:20])
plt.gca().invert_yaxis()


plt.subplot(3,2,2)
plt.xlabel("Topic 1",fontsize=24,color='steelblue',fontfamily='serif',weight='bold')
plt.barh(frequent_words_df.loc[1,'values'][:20], frequent_words_df.loc[1,'counts'][:20])
plt.gca().invert_yaxis()

plt.subplot(3,2,3)
plt.xlabel("Topic 2",fontsize=24,color='steelblue',fontfamily='serif',weight='bold')
plt.barh(frequent_words_df.loc[2,'values'][:20], frequent_words_df.loc[2,'counts'][:20])
plt.gca().invert_yaxis()


plt.subplot(3,2,4)
plt.xlabel("Topic 3",fontsize=24,color='steelblue',fontfamily='serif',weight='bold')
plt.barh(frequent_words_df.loc[3,'values'][:20], frequent_words_df.loc[3,'counts'][:20])
plt.gca().invert_yaxis()


plt.subplot(3,2,5)
plt.xlabel("Topic 4",fontsize=24,color='steelblue',fontfamily='serif',weight='bold')
plt.barh(frequent_words_df.loc[4,'values'][:20], frequent_words_df.loc[4,'counts'][:20])
plt.gca().invert_yaxis()
Inference:
considering negative reviews, except Topic-0, which is mostly skewed towards positive, others are have negative elements.
Possible reasons for Topic-0 having positive words is because of people at times consider 3 as average/positive ratings, and we have marked it negative. Another possible reasons may be thwarted sentences.
As is seen in other topics, it seems people are talking about it being a remake of orginal movie and it lacks in some scenes, story, at times lack emotions.
e.g.
Topic 0 : Remake has Disney animation characters with live animal feel
Topic 1 : Liked original, but remake lacks emotion.

Sentiment polarity analysis at review level

In [245]:
def review_polarity(dataframe,column_name,dest_column_name):
    for index,row in dataframe.iterrows():
        review=TextBlob(row[column_name])

        dataframe.at[index,dest_column_name]=review.polarity
In [206]:
review_polarity(total_data,"clean_text1","rev_polarity")
In [228]:
total_data.head()
Out[228]:
review score clean_text targetSentiment word_count review_len clean_text1 word_count_clean clean_adj_adv_verb_col clean_adj_col clean_nn doc_topic_cluster_group tokenized_text rev_polarity
0 I liked most that the animation made the anima... 5.0 I liked most that the animation made the anima... 0 12 61 like animation animal look real 5 like look real like real like animation animal 0 [like, animation, animal, look, real] 0.200000
1 Amazing! So realistic and incredible music 5.0 Amazing! So realistic and incredible music 0 6 42 amazing realistic incredible music 4 amazing realistic incredible amazing realistic incredible music 0 [amazing, realistic, incredible, music] 0.555556
2 Classic. Good remake. Loved it. Glover was out... 5.0 Classic. Good remake. Loved it. Glover was out... 0 8 54 classic good remake love glover outstanding 6 classic good love outstanding classic good love outstanding remake love glover 2 [classic, good, remake, love, glover, outstand... 0.466667
3 Nice animation/CGI but completely lacking the ... 2.0 Nice animation/CGI but completely lacking the ... 1 26 177 nice animation cgi completely lack disney styl... 17 nice completely lack disney wonder original un... nice disney original undue kiddy real animation cgi style humour concentration viole... 0 [nice, animation, cgi, completely, lack, disne... 0.318750
4 Good but go mainly for the sentimental value 4.0 Good but go mainly for the sentimental value 0 8 44 good mainly sentimental value 4 good mainly sentimental good sentimental value 2 [good, mainly, sentimental, value] 0.225000
In [210]:
sent_polarity_df=pd.DataFrame(total_data,columns=['targetSentiment','rev_polarity'])
In [225]:
sent_polarity_df[sent_polarity_df['targetSentiment']==0]['rev_polarity'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    color="blue",
    title='Polarity distribution of Positive reviews')
In [226]:
sent_polarity_df[sent_polarity_df['targetSentiment']==1]['rev_polarity'].iplot(
    kind='hist',
    xTitle='rating',
    linecolor='black',
    yTitle='count',
    color="orange",
    title='Polarity distribution of Negative reviews')
Inference:
Polarity distribution of Positive reviews have less mis-classification, than negative reviews. For negative reviews, there is high no. of misclassification happening in 0-0.5 polarity range

keeping a copy of total modifications and making a new df for going ahead of feature engineering

In [254]:
final_data=total_data.copy(deep=True)
In [255]:
final_data.columns.values
Out[255]:
array(['review', 'score', 'clean_text', 'targetSentiment', 'word_count',
       'review_len', 'clean_text1', 'word_count_clean',
       'clean_adj_adv_verb_col', 'clean_adj_col', 'clean_nn',
       'doc_topic_cluster_group', 'tokenized_text', 'rev_polarity'],
      dtype=object)
Ending EDA
In [ ]: